###============================================================================###
#================================================================================#
#Detecting Misinformation: Identifying False News Spread by Political Leaders in the Global South
#Code to create final-post-dat.rds and final-politician-dat.rds from original 
#social media data
#================================================================================#
###============================================================================###

#load packages
# library() stops with an error as soon as a package is missing, whereas
# require() only returns FALSE and lets the script fail later with an
# unrelated "could not find function" message.
pkgs <- c("tidyverse", "readxl")
invisible(lapply(pkgs, library, character.only = TRUE))

#================================================================================#
#================================================================================#
# final-post-dat.rds: a post level dataframe with all posts information and a binary 
#indicator for whether it was flagged as false by one of five detection approaches:
# (1) text-based approach; (2) domain-based approach; (3) FB URL approach; 
# (4) repeat offenders approach; (5) GDI approach
#****DUE TO THE PRIVACY RESTRICTIONS WE ARE UNABLE TO SHARE POST LEVEL DATA
#================================================================================#
#================================================================================#


#================================================================================
#Step 1: Load all SM data by platform, keeping only columns needed for analysis
#================================================================================

#load all social media data
# Facebook posts: first row per post URL, with columns renamed to the
# cross-platform names used in the rest of the script.
all_facebook <- read_csv("Data/full-fb-df.csv") %>%
  distinct(URL, .keep_all = TRUE) %>%
  dplyr::select(
    politician_name,
    user_name = User.Name,
    month,
    year,
    `date-time`,
    url_platform = URL,
    all_text,
    total_reactions = Total.Interactions,
    total_likes = Likes,
    total_comments = Comments,
    total_shares = Shares,
    total_love = Love,
    total_wow = Wow,
    total_haha = Haha,
    total_sad = Sad,
    total_angry = Angry,
    total_cares = Care,
    total_views = Total.Views
  ) %>%
  mutate(platform = "Facebook") %>%
  # Keep only the part of `date-time` before the first space as `date`.
  # extra = "drop" discards the remainder on purpose instead of emitting an
  # "additional pieces discarded" warning; the resulting values are unchanged.
  separate(`date-time`, "date", sep = " ", extra = "drop")

# Instagram posts: first row per post URL, with columns renamed to the
# cross-platform names used in the rest of the script.
all_instagram <- read_csv("Data/full-ig-df.csv") %>%
  distinct(URL, .keep_all = TRUE) %>%
  dplyr::select(
    politician_name,
    user_name = User.Name,
    month,
    year,
    `date-time`,
    url_platform = URL,
    all_text,
    total_reactions = Total.Interactions,
    total_likes = Likes,
    total_comments = Comments,
    total_views = Views
  ) %>%
  mutate(platform = "Instagram") %>%
  # Keep only the part of `date-time` before the first space as `date`.
  # extra = "drop" discards the remainder on purpose instead of emitting an
  # "additional pieces discarded" warning; the resulting values are unchanged.
  separate(`date-time`, "date", sep = " ", extra = "drop")

# Twitter posts: first row per tweet URL. Engagement counts arrive packed in a
# single `public_metrics` string and are unpacked into numeric columns below.
all_twitter <- read_csv("Data/full-twitter-df.csv") %>%
  distinct(url, .keep_all = TRUE) %>%
  dplyr::select(
    politician_name,
    user_name = handle,
    month,
    year,
    `date-time`,
    url_platform = url,
    all_text = text,
    public_metrics
  ) %>%
  # Keep only the part of `date-time` before the first "T" as `date`.
  # extra = "drop" discards the remainder on purpose instead of emitting an
  # "additional pieces discarded" warning; the resulting values are unchanged.
  separate(`date-time`, "date", sep = "T", extra = "drop") %>%
  # Split the metrics string on commas into four pieces, in this fixed order.
  # Left at the default `extra` so an unexpected fifth field still warns.
  separate(
    col = public_metrics,
    into = c("total_shares", "total_comments", "total_likes", "total_quotes"),
    sep = ","
  ) %>%
  # Punctuation is stripped from the first and last pieces only (presumably
  # because they carry the enclosing brackets of the metrics string -- confirm
  # against the raw data). This also removes quotes, underscores and colons,
  # which is why their labels below are matched without punctuation.
  mutate(
    total_shares = str_replace_all(total_shares, "[[:punct:]]", ""),
    total_quotes = str_replace_all(total_quotes, "[[:punct:]]", "")
  ) %>%
  # Remove each field label and convert what is left to a number. A value that
  # does not match the expected label becomes NA (with a coercion warning).
  mutate(
    total_shares = as.numeric(gsub("retweetcount ", "", total_shares)),
    total_comments = as.numeric(gsub(" 'reply_count': ", "", total_comments)),
    total_likes = as.numeric(gsub(" 'like_count': ", "", total_likes)),
    total_quotes = as.numeric(gsub(" quotecount ", "", total_quotes)),
    # overall engagement = sum of the four unpacked counts
    total_reactions = total_shares + total_comments + total_likes + total_quotes
  ) %>%
  mutate(platform = "Twitter")

#get total number of posts across platforms
sum(nrow(all_facebook), nrow(all_instagram), nrow(all_twitter))

#================================================================================
#Step 2: Load classification output
#================================================================================

# Classifier output per platform, reduced to one row per post URL. Every URL
# present in these files receives pred_prob = 1; no probability threshold is
# applied in this script, so the files are presumably already restricted to
# predicted probability > .9 upstream (not visible here).

# Facebook
fb_classification <- "Data/pred-prob-fb.rds" %>%
  read_rds() %>%
  distinct(URL, .keep_all = TRUE) %>%
  dplyr::select(url_platform = URL) %>%
  mutate(pred_prob = 1)

# Instagram
ig_classification <- "Data/pred-prob-ig.rds" %>%
  read_rds() %>%
  distinct(URL, .keep_all = TRUE) %>%
  dplyr::select(url_platform = URL) %>%
  mutate(pred_prob = 1)

# Twitter (URL column is lower-case `url` in this file)
twitter_classification <- "Data/pred-prob-twitter.rds" %>%
  read_rds() %>%
  distinct(url, .keep_all = TRUE) %>%
  dplyr::select(url_platform = url) %>%
  mutate(pred_prob = 1)

#get total number of classifier-flagged posts across platforms
nrow(fb_classification) + nrow(ig_classification) + nrow(twitter_classification)

#================================================================================
#Step 3: Load cosine output
#================================================================================

# Posts whose maximum TF-IDF cosine similarity exceeds .4, per platform.
# Each table is reduced to one row per post URL so that the left joins further
# down cannot multiply post rows and so that the count below counts posts.

# Facebook: threshold first, then de-duplicate, so a URL is flagged when any
# of its rows exceeds the threshold (same flagged URL set as before).
fb_cosine <- read_csv("Data/boatos-fb-tfidf-cosine.csv") %>%
  filter(max_cosine_tfidf > .4) %>%
  dplyr::select(url_platform = URL) %>%
  distinct() %>%
  mutate(cosine_above = 1)

# Instagram: same treatment as Facebook.
ig_cosine <- read_csv("Data/boatos-ig-tfidf-cosine.csv") %>%
  filter(max_cosine_tfidf > .4) %>%
  dplyr::select(url_platform = URL) %>%
  distinct() %>%
  mutate(cosine_above = 1)

# Twitter: de-duplicated on `url.x` BEFORE the threshold filter (kept as in the
# original), so when a URL has several rows only its first row's score is
# tested.
twitter_cosine <- read_csv("Data/boatos-twitter-tfidf-cosine.csv") %>%
  distinct(`url.x`, .keep_all = TRUE) %>%
  filter(max_cosine_tfidf > .4) %>%
  dplyr::select(url_platform = `url.x`) %>%
  mutate(cosine_above = 1)

#get total number of posts with cosine similarity > .4 across platforms
sum(nrow(fb_cosine), nrow(ig_cosine), nrow(twitter_cosine))

#================================================================================
#Step 4: Merge Facebook data by approach
#================================================================================

# Each approach table below is reduced to the join keys (user_name,
# url_platform), de-duplicated so a post listed more than once cannot multiply
# rows in the left joins, and given a constant 0/1-style flag column.

#Text-approach - ##initial data from final human review step of text-based approach
fake_posts_facebook <- read_xlsx("Data/fake-all-sm-text-approach.xlsx", sheet = 2) %>%
  filter(final_broader == "match") %>%
  dplyr::select(user_name, url_platform) %>%
  distinct() %>%
  mutate(fake_post = 1) # dummy variable for text approach

#Domain-approach - ##initial data from final string match of domain list from Facebook URLs data
facebook_url_fake <- read_csv("Data/fake-facebook-urls-approach.csv") %>%
  dplyr::select(user_name = `User.Name`, url_platform = URL) %>%
  distinct() %>%
  mutate(base_url_fake = 1) # dummy variable for domain approach

#Full FB URL approach - ##initial data from SS1 URLs dataset
full_fake_df_full_url_fb <- read_csv("Data/fake-all-sm-fb-urls-approach.csv") %>%
  filter(full_fake_url > 0) %>%
  filter(platform == "Facebook") %>%
  dplyr::select(user_name = `User.Name`, url_platform = URL) %>%
  distinct() %>%
  mutate(full_fake_url = 1) # dummy variable for fb url approach

#Repeat offenders approach - ##initial data from final string match off of domain list from Facebook URLs data with two or more unique URLs
facebook_repeat <- read_csv("Data/fake-facebook-repeat-offenders-url-approach.csv") %>%
  dplyr::select(user_name = `User.Name`, url_platform = URL) %>%
  distinct() %>%
  mutate(fake_url_repeat_offenders = 1) # dummy variable for repeat approach

#GDI approach - ##initial data from  string match off of domain list from GDI
facebook_gdi <- read_csv("Data/fake-facebook-gdi-approach.csv") %>%
  dplyr::select(user_name = `User.Name`, url_platform = URL) %>%
  distinct() %>%
  mutate(fake_url_gdi = 1) # dummy variable for gdi approach

# Attach every flag to the full set of Facebook posts. Join keys are spelled
# out (they are the columns the tables share, i.e. what the implicit natural
# join used) so a column added later cannot silently change the match.
# Posts without a match keep NA in the flag column.
all_facebook_fake <- all_facebook %>%
  left_join(fb_classification, by = "url_platform") %>% #classification > .9
  left_join(fb_cosine, by = "url_platform") %>% #cosine similarity > .4
  left_join(fake_posts_facebook, by = c("user_name", "url_platform")) %>% #text dummy
  left_join(facebook_url_fake, by = c("user_name", "url_platform")) %>% #domain dummy
  left_join(full_fake_df_full_url_fb, by = c("user_name", "url_platform")) %>% #full fb url dummy
  left_join(facebook_repeat, by = c("user_name", "url_platform")) %>% #repeat offenders dummy
  left_join(facebook_gdi, by = c("user_name", "url_platform")) #gdi dummy

#check: number of Facebook posts flagged by each approach
sum(all_facebook_fake$pred_prob, na.rm = TRUE)
sum(all_facebook_fake$cosine_above, na.rm = TRUE)
sum(all_facebook_fake$fake_post, na.rm = TRUE)
sum(all_facebook_fake$base_url_fake, na.rm = TRUE)
sum(all_facebook_fake$full_fake_url, na.rm = TRUE)
sum(all_facebook_fake$fake_url_repeat_offenders, na.rm = TRUE)
sum(all_facebook_fake$fake_url_gdi, na.rm = TRUE)

#================================================================================
#Step 5: Merge Instagram data by approach
#================================================================================

# Each approach table below is reduced to the join keys (user_name,
# url_platform), de-duplicated so a post listed more than once cannot multiply
# rows in the left joins, and given a constant 0/1-style flag column.

#Text-approach
fake_posts_instagram <- read_xlsx("Data/fake-all-sm-text-approach.xlsx", sheet = 1) %>%
  filter(final_broader == "match") %>%
  dplyr::select(user_name, url_platform) %>%
  distinct() %>%
  mutate(fake_post = 1) # dummy variable for text approach

#Domain-approach
instagram_url_fake <- read_csv("Data/fake-instagram-urls-approach.csv") %>%
  dplyr::select(user_name = `User.Name`, url_platform = URL) %>%
  distinct() %>%
  mutate(base_url_fake = 1) # dummy variable for domain approach

#Full FB URL approach - NO IG DATA

#Repeat offenders approach
instagram_repeat <- read_csv("Data/fake-instagram-repeat-offenders-url-approach.csv") %>%
  dplyr::select(user_name = `User.Name`, url_platform = URL) %>%
  distinct() %>%
  mutate(fake_url_repeat_offenders = 1) # dummy variable for repeat approach

#GDI approach
instagram_gdi <- read_csv("Data/fake-instagram-gdi-approach.csv") %>%
  dplyr::select(user_name = `User.Name`, url_platform = URL) %>%
  distinct() %>%
  mutate(fake_url_gdi = 1) # dummy variable for gdi approach

# Attach every flag to the full set of Instagram posts. Join keys are spelled
# out (they are the columns the tables share, i.e. what the implicit natural
# join used) so a column added later cannot silently change the match.
# Posts without a match keep NA in the flag column.
all_instagram_fake <- all_instagram %>%
  left_join(ig_classification, by = "url_platform") %>% #classification > .9
  left_join(ig_cosine, by = "url_platform") %>% #cosine similarity > .4
  left_join(fake_posts_instagram, by = c("user_name", "url_platform")) %>% #text dummy
  left_join(instagram_url_fake, by = c("user_name", "url_platform")) %>% #domain dummy
  left_join(instagram_repeat, by = c("user_name", "url_platform")) %>% #repeat offenders dummy
  left_join(instagram_gdi, by = c("user_name", "url_platform")) #gdi dummy

#check: number of Instagram posts flagged by each approach
sum(all_instagram_fake$pred_prob, na.rm = TRUE)
sum(all_instagram_fake$cosine_above, na.rm = TRUE)
sum(all_instagram_fake$fake_post, na.rm = TRUE)
sum(all_instagram_fake$base_url_fake, na.rm = TRUE)
sum(all_instagram_fake$fake_url_repeat_offenders, na.rm = TRUE)
sum(all_instagram_fake$fake_url_gdi, na.rm = TRUE)

#================================================================================
#Step 6: Merge Twitter data by approach
#================================================================================

# Each approach table below is reduced to the join keys (user_name,
# url_platform), de-duplicated so a post listed more than once cannot multiply
# rows in the left joins, and given a constant 0/1-style flag column.

#Text-approach
fake_posts_twitter <- read_xlsx("Data/fake-all-sm-text-approach.xlsx", sheet = 3) %>%
  filter(final_broader == "match") %>%
  dplyr::select(user_name, url_platform) %>%
  distinct() %>%
  mutate(fake_post = 1) # dummy variable for text approach

#Domain-approach
twitter_url_fake <- read_csv("Data/fake-twitter-urls-approach.csv") %>%
  dplyr::select(user_name = handle, url_platform = url) %>%
  distinct() %>%
  mutate(base_url_fake = 1) # dummy variable for domain approach

#Full FB URL approach (this shared file uses `User.Name`/`URL`, not handle/url)
full_fake_df_full_url_twitter <- read_csv("Data/fake-all-sm-fb-urls-approach.csv") %>%
  filter(full_fake_url > 0) %>%
  filter(platform == "Twitter") %>%
  dplyr::select(user_name = `User.Name`, url_platform = URL) %>%
  distinct() %>%
  mutate(full_fake_url = 1) # dummy variable for fb url approach

#Repeat offenders approach
twitter_repeat <- read_csv("Data/fake-twitter-repeat-offenders-url-approach.csv") %>%
  dplyr::select(user_name = handle, url_platform = url) %>%
  distinct() %>%
  mutate(fake_url_repeat_offenders = 1) # dummy variable for repeat approach

#GDI approach
twitter_gdi <- read_csv("Data/fake-twitter-gdi-approach.csv") %>%
  dplyr::select(user_name = handle, url_platform = url) %>%
  distinct() %>%
  mutate(fake_url_gdi = 1) # dummy variable for gdi approach

# Attach every flag to the full set of tweets. Join keys are spelled out (they
# are the columns the tables share, i.e. what the implicit natural join used)
# so a column added later cannot silently change the match.
# Posts without a match keep NA in the flag column.
all_twitter_fake <- all_twitter %>%
  left_join(twitter_classification, by = "url_platform") %>% #classification > .9
  left_join(twitter_cosine, by = "url_platform") %>% #cosine similarity > .4
  left_join(fake_posts_twitter, by = c("user_name", "url_platform")) %>% #text dummy
  left_join(twitter_url_fake, by = c("user_name", "url_platform")) %>% #domain dummy
  left_join(full_fake_df_full_url_twitter, by = c("user_name", "url_platform")) %>% #full fb url dummy
  left_join(twitter_repeat, by = c("user_name", "url_platform")) %>% #repeat offenders dummy
  left_join(twitter_gdi, by = c("user_name", "url_platform")) #gdi dummy

#check: number of tweets flagged by each approach
sum(all_twitter_fake$pred_prob, na.rm = TRUE)
sum(all_twitter_fake$cosine_above, na.rm = TRUE)
sum(all_twitter_fake$fake_post, na.rm = TRUE)
sum(all_twitter_fake$base_url_fake, na.rm = TRUE)
sum(all_twitter_fake$full_fake_url, na.rm = TRUE)
sum(all_twitter_fake$fake_url_repeat_offenders, na.rm = TRUE)
sum(all_twitter_fake$fake_url_gdi, na.rm = TRUE)

#================================================================================
#Step 7: Join all detection approaches together
#================================================================================

#combine all detection approaches together

# Flag columns produced by the per-platform merges. Instagram has no
# full_fake_url column, so it is filled with NA when the tables are stacked.
flag_cols <- c(
  "pred_prob", "cosine_above", "fake_post", "base_url_fake",
  "full_fake_url", "fake_url_repeat_offenders", "fake_url_gdi"
)

#add all posts together with fake posts by detection
# The three tables hold different platforms, so they are stacked row-wise.
# bind_rows() states that directly; the previous full_join() with implicit keys
# over every shared column yielded the same rows and column order because no
# row can match across platforms.
all_posts <- bind_rows(all_twitter_fake, all_instagram_fake, all_facebook_fake) %>%
  # one row per post URL (first occurrence wins)
  distinct(url_platform, .keep_all = TRUE) %>%
  # a missing flag means the post was not flagged by that approach
  mutate(across(all_of(flag_cols), ~ replace_na(.x, 0)))

#check to make sure tallies add up
all_posts %>%
  dplyr::select(all_of(flag_cols)) %>%
  summarise(across(everything(), sum))

#save post level data
saveRDS(all_posts, "Data/final-post-dat.rds")

#================================================================================
#POLITICIAN LEVEL DATASET: final-politician-dat.rds
#a politician level dataframe with all posts information and binary and count columns
# for whether the politicians' posts were flagged as false by one of five detection 
#approaches: (1) text-based approach; (2) domain-based approach; (3) FB URL approach; 
# (4) repeat offenders approach; (5) GDI approach
#================================================================================

#load post data
all_posts <- readRDS("Data/final-post-dat.rds")

# Collapse the post-level data to one row per politician. For each detection
# approach this yields a count of flagged posts and a 0/1 indicator that is 1
# when the count is positive.
pols <- all_posts %>%
  # drop the post-level columns that are not aggregated below
  dplyr::select(
    -pred_prob, -cosine_above, -user_name, -url_platform,
    -date, -month, -year, -all_text
  ) %>%
  group_by(politician_name) %>%
  dplyr::summarize(
    total_posts = n(),
    # text approach
    total_fake_posts = sum(fake_post),
    fake_dummy = as.numeric(total_fake_posts > 0),
    # domain approach
    total_fake_posts_url = sum(base_url_fake),
    fake_dummy_url = as.numeric(total_fake_posts_url > 0),
    # full FB url approach
    full_fb_url = sum(full_fake_url),
    full_fb_url_dummy = as.numeric(full_fb_url > 0),
    # repeat offenders approach
    total_fake_posts_url_repeat_offender = sum(fake_url_repeat_offenders),
    fake_dummy_url_repeat_repeat_offender =
      as.numeric(total_fake_posts_url_repeat_offender > 0),
    # gdi approach
    total_fake_posts_gdi = sum(fake_url_gdi),
    fake_dummy_gdi = as.numeric(total_fake_posts_gdi > 0)
  )

#check to make sure tallies add up post level
pols %>%
  dplyr::select(
    total_posts,
    total_fake_posts,
    total_fake_posts_url,
    full_fb_url,
    total_fake_posts_url_repeat_offender,
    total_fake_posts_gdi
  ) %>%
  summarise(across(everything(), sum))


#join with demographic info about pols collected from TSE
pol_data <- read_csv("Data/politician-demographics.csv") %>%
  dplyr::select(
    pol_id, politician_name, age, sex_imputed, educ, elec_coalition,
    ideo_alt, ideology_median, ideology_extreme, ideology_extreme_levels,
    pol_exp, position
  )

# politician_name is the only column the two tables share. Naming the key
# explicitly keeps the join from silently changing if either table gains a
# column, and suppresses the "Joining with `by = ...`" message.
final_pols <- pols %>%
  left_join(pol_data, by = "politician_name") %>% #merge on politician name
  dplyr::select(-politician_name) #remove politician name


#check to make sure tallies add up at pol level
final_pols %>%
  dplyr::select(
    fake_dummy, fake_dummy_url, full_fb_url_dummy,
    fake_dummy_url_repeat_repeat_offender, fake_dummy_gdi,
    total_posts, total_fake_posts, total_fake_posts_url, full_fb_url,
    total_fake_posts_url_repeat_offender, total_fake_posts_gdi
  ) %>%
  summarise(across(everything(), sum))

#save politician level dataframe (written as an .rds file)
saveRDS(final_pols, "Data/final-politician-dat.rds")
